{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def F1(predictions, y):\n",
    "\n",
    "    TP = np.sum((predictions == 1) & (y == 1))\n",
    "    FP = np.sum((predictions == 1) & (y == 0))\n",
    "    FN = np.sum((predictions == 0) & (y == 1))\n",
    "    if TP + FP == 0:\n",
    "        precision = 0\n",
    "    else:\n",
    "        precision = float(TP) / (TP + FP)\n",
    "    if TP + FN == 0:\n",
    "        recall = 0\n",
    "    else:\n",
    "        recall = float(TP) / (TP + FN)\n",
    "    if precision + recall == 0:\n",
    "        return 0\n",
    "    else:\n",
    "        return (2.0 * precision * recall) / (precision + recall)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def gaussianModel(X):\n",
    "\n",
    "    # 参数估计\n",
    "    m, n = X.shape\n",
    "    mu = np.mean(X, axis=0)\n",
    "    sigma2 = np.var(X, axis=0)\n",
    "    \n",
    "    def p(x): # x是单个样本，n*1维\n",
    "        total = 1\n",
    "        for j in range(x.shape[0]):\n",
    "            total *= np.exp(-np.power((x[j, 0] - mu[0, j]), 2) / (2 * sigma2[0, j])\n",
    "                            ) / (np.sqrt(2 * np.pi * sigma2[0, j]))\n",
    "        return total\n",
    "    return p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def multivariateGaussianModel(X):\n",
    "\n",
    "    # 参数估计\n",
    "    m, n = X.shape\n",
    "    mu = np.mean(X.T, axis=1)\n",
    "    Sigma = np.mat(np.cov(X.T))\n",
    "    detSigma = np.linalg.det(Sigma)\n",
    "\n",
    "    def p(x):\n",
    "        x = x - mu\n",
    "        return np.exp(-x.T * np.linalg.pinv(Sigma) * x / 2).A[0] * \\\n",
    "            ((2*np.pi)**(-n/2.0) * (detSigma**(-0.5) ))\n",
    "    return p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def train(X, model=gaussianModel):\n",
    "    return model(X) # 返回的是概率模型p"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def selectEpsilon(XVal, yVal, p):\n",
    "    \n",
    "    pVal = np.mat([p(x.T) for x in XVal]).reshape(-1, 1) # 交叉验证集中所有样本的概率\n",
    "    step = (np.max(pVal) - np.min(pVal)) / 1000.0\n",
    "    \n",
    "    bestEpsilon = 0\n",
    "    bestF1 = 0\n",
    "    \n",
    "    for epsilon in np.arange(np.min(pVal), np.max(pVal), step):\n",
    "        predictions = pVal < epsilon\n",
    "        f1 = F1(predictions, yVal)\n",
    "        if f1 > bestF1:\n",
    "            bestF1 = f1\n",
    "            bestEpsilon = epsilon\n",
    "    return bestEpsilon, bestF1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from scipy.io import loadmat\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif']=['SimHei']\n",
    "plt.rcParams['axes.unicode_minus']=False"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 低维数据测试\n",
    "data = loadmat('data/ex8data1.mat')\n",
    "X = np.mat(data['X'])\n",
    "XVal = np.mat(data['Xval'])\n",
    "yVal = np.mat(data['yval'])\n",
    "\n",
    "p = train(X)\n",
    "#p = train(X, model=multivariateGaussianModel)\n",
    "pTest = np.mat([p(x.T) for x in X]).reshape(-1, 1)\n",
    "\n",
    "# 绘制数据点\n",
    "plt.xlabel(u'延迟 (ms)')\n",
    "plt.ylabel(u'吞吐 (mb/s)')\n",
    "plt.plot(X[:, 0], X[:, 1], 'bx')\n",
    "epsilon, f1 = selectEpsilon(XVal, yVal, p)\n",
    "\n",
    "print u'基于交叉验证集最佳ε: %e\\n'%epsilon\n",
    "print u'基于交叉验证集最佳F1:  %f\\n'%f1\n",
    "print u'找到 %d 个异常点' % np.sum(pTest < epsilon)\n",
    "\n",
    "# 获得训练集的异常点\n",
    "outliers = np.where(pTest < epsilon, True, False).ravel()\n",
    "plt.plot(X[outliers, 0], X[outliers, 1], 'ro', lw=2, markersize=10, fillstyle='none', markeredgewidth=1)\n",
    "n = np.linspace(0, 35, 100)\n",
    "X1 = np.meshgrid(n,n)\n",
    "XFit = np.mat(np.column_stack((X1[0].T.flatten(), X1[1].T.flatten())))\n",
    "pFit = np.mat([p(x.T) for x in XFit]).reshape(-1, 1)\n",
    "pFit = pFit.reshape(X1[0].shape)\n",
    "\n",
    "if not np.isinf(np.sum(pFit)):\n",
    "    plt.contour(X1[0], X1[1], pFit, 10.0**np.arange(-20, 0, 3).T)\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 高维数据\n",
    "data = loadmat('data/ex8data2.mat')\n",
    "X = np.mat(data['X'])\n",
    "XVal = np.mat(data['Xval'])\n",
    "yVal = np.mat(data['yval'])\n",
    "\n",
    "p = train(X)\n",
    "#p = train(X, model=multivariateGaussianModel)\n",
    "pTest = np.mat([p(x.T) for x in X]).reshape(-1, 1)\n",
    "\n",
    "epsilon, f1 = selectEpsilon(XVal, yVal, p)\n",
    "\n",
    "print 'Best epsilon found using cross-validation: %e\\n'%epsilon\n",
    "print 'Best F1 on Cross Validation Set:  %f\\n'%f1\n",
    "print '# Outliers found: %d' % np.sum(pTest < epsilon)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
